
library(dplyr)
library(haven)
library(readr)
library(assertr)
library(tidyr)

setwd('./NC Data')
rawdata_path = 'raw_data_from_ncerdc'

both_crs = c('0000')
math_crs = setdiff(paste(2000:2500),c('2016','2060'))
read_crs = setdiff(paste(1000:1038),c('1014','1018','1025','1029','1031','1032'))

## Read raw data for years 2008 to 2014
CRS = setNames(vector(mode="list", length=7), 2008:2014)
SCH = setNames(vector(mode="list", length=7), 2008:2014)

for (yr in 2008:2014){
  
  cat('Converting .dta file for year',yr,'\n')

  dta_file = sprintf('crs_memb_pub%s.dta',yr)
  
  D = read_dta(file.path(rawdata_path,dta_file),encoding="latin1") %>%
    filter(!is.na(mastid)) %>%
    rename(any_of(c(year = 'reporting_year', statecourse = 'coursecode',
                    numstudents = 'nstudents'))) %>%
    mutate(across(c("mastid","year","teachid"),as.numeric),
           statecourse_prefix = substr(statecourse,1,4),
           sbj = case_when(statecourse_prefix %in% both_crs ~ 0,
                           statecourse_prefix %in% read_crs ~ 1,
                           statecourse_prefix %in% math_crs ~ 2,
                           TRUE ~ NA)) %>%
    select(-c(birthdt,sex,ethnic,grade))
  
  SCH[[paste(yr)]] = D %>%
    select(year, lea, schlcode, mastid) %>%
    distinct()
  
  CRS[[paste(yr)]] = D %>%
    filter(!is.na(teachid) & !is.na(sbj)) %>%
    group_by(across(-c(mastid))) %>%
    mutate(crsid = cur_group_id()) %>%
    ungroup() %>%
    distinct() %>%
    select(year,lea,schlcode,sbj,teachid,crsid,mastid)
  
}

SCH = bind_rows(SCH)
CRS = bind_rows(CRS)

TCH = CRS %>%
  select(year,lea,schlcode,mastid,sbj,teachid) %>%
  group_by(year,lea,schlcode,mastid,sbj) %>%
  mutate(teachid = replace(teachid, min(teachid)!=max(teachid), 0)) %>%
  filter(row_number()==1) %>%
  ungroup() %>%
  pivot_wider(names_from = 'sbj', names_prefix = 'sbj_', values_from = 'teachid') %>%
  mutate(teachid_reading = case_when(!is.na(sbj_1) & sbj_1!=0 ~ sbj_1,
                                     is.na(sbj_1) & !is.na(sbj_0) & sbj_0!=0 ~ sbj_0,
                                     TRUE ~ NA),
         teachid_math = case_when(!is.na(sbj_2) & sbj_2!=0 ~ sbj_2,
                                  is.na(sbj_2) & !is.na(sbj_0) & sbj_0!=0 ~ sbj_0,
                                  TRUE ~ NA)) %>%
  select(year,lea,schlcode,mastid,teachid_math,teachid_reading)
         
# Combine score data and school data
# set score to missing if student appears in multiple schools in year
# set teacherid to missing if teacher has more than 200 students in year
# drop charter schools
data = read_rds('student_scores.rds') %>%
  inner_join(SCH, by = c('year','mastid')) %>%
  left_join(TCH, by = c('year','lea','schlcode','mastid')) %>%
  group_by(year, mastid) %>%
  mutate(ma_score = replace(ma_score, n()>1, NA),
         rd_score = replace(rd_score, n()>1, NA)) %>%
  group_by(year, teachid_reading) %>%
  mutate(teachid_reading = replace(teachid_reading, n()>200, NA)) %>%
  group_by(year, teachid_math) %>%
  mutate(teachid_math = replace(teachid_math, n()>200, NA)) %>%
  ungroup() %>%
  filter(!stringr::str_detect(lea,'^\\d+[:alpha:]$'))

write_rds(data,'student_all.rds')
